{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import malaya"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model = malaya.dependency.transformer(model = 'xlnet', quantized = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://ms.wikipedia.org/wiki/Penjodoh_bilangan_bahasa_Melayu\n",
    "penjodoh_bilangan = [\n",
    "    'angkatan',\n",
    "    'baris',\n",
    "    'batang',\n",
    "    'bentuk',\n",
    "    'bidang',\n",
    "    'biji',\n",
    "    'bilah',\n",
    "    'buah',\n",
    "    'buku',\n",
    "    'bungkus',\n",
    "    'butir',\n",
    "    'carik',\n",
    "    'cebis',\n",
    "    'cekak',\n",
    "    'cubit',\n",
    "    'cucuk',\n",
    "    'das',\n",
    "    'deret',\n",
    "    'ekor',\n",
    "    'gugus',\n",
    "    'gelung',\n",
    "    'gemal',\n",
    "    'genggam',\n",
    "    'gulung',\n",
    "    'gumpal',\n",
    "    'helai',\n",
    "    'hidangan',\n",
    "    'hiris',\n",
    "    'ikat',\n",
    "    'jambak',\n",
    "    'jambangan',\n",
    "    'jemput',\n",
    "    'kaki',\n",
    "    'kalung',\n",
    "    'kandang',\n",
    "    'kapur',\n",
    "    'kawan',\n",
    "    'kelompok',\n",
    "    'kepal',\n",
    "    'keping',\n",
    "    'kepul',\n",
    "    'kerat',\n",
    "    'ketul',\n",
    "    'kotak',\n",
    "    'kuntum',\n",
    "    'laras',\n",
    "    'lembar',\n",
    "    'lingkar',\n",
    "    'longgok',\n",
    "    'naskhah',\n",
    "    'orang',\n",
    "    'papan',\n",
    "    'pasang',\n",
    "    'pasukan',\n",
    "    'patah',\n",
    "    'pintu',\n",
    "    'potong',\n",
    "    'pucuk',\n",
    "    'puntung',\n",
    "    'rangkap',\n",
    "    'rawan',\n",
    "    'ruas',\n",
    "    'rumpun',\n",
    "    'sikat',\n",
    "    'sisir',\n",
    "    'suap',\n",
    "    'tandan',\n",
    "    'tangkai',\n",
    "    'teguk',\n",
    "    'timbun',\n",
    "    'titik',\n",
    "    'tongkol',\n",
    "    'ulas',\n",
    "    'untai',\n",
    "    'urat',\n",
    "    'utas',\n",
    "]\n",
    "hubung_list = [\n",
    "    'agar',\n",
    "    'apabila',\n",
    "    'atau',\n",
    "    'bahawa',\n",
    "    'dan',\n",
    "    'hingga',\n",
    "    'jika',\n",
    "    'jikalau',\n",
    "    'kecuali',\n",
    "    'kerana',\n",
    "    'lalu',\n",
    "    'manakala',\n",
    "    'sambil',\n",
    "    'serta',\n",
    "    'semenjak',\n",
    "    'sementara',\n",
    "    'sungguhpun',\n",
    "    'supaya',\n",
    "    'walaupun',\n",
    "    'tetapi',\n",
    "    'berkenan',\n",
    "    'berkenaan',\n",
    "    'yang',\n",
    "    'juga',\n",
    "    'tersebut'\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "sepenjodoh_bilangan = [f'se{w}' for w in penjodoh_bilangan]\n",
    "set_sepenjodoh_bilangan = set(sepenjodoh_bilangan)\n",
    "set_penjodoh_bilangan = set(penjodoh_bilangan)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2037249"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('filtered-dumping-wiki.txt') as fopen:\n",
    "    data = list(filter(None, fopen.read().split('\\n')))\n",
    "    \n",
    "data = [i for i in data if len(i) >= 2]\n",
    "\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = malaya.preprocessing.TOKENIZER().tokenize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# graph, tagging, indexing = model.predict(' '.join(tokenizer('ibu bapa kita bekerja penat lelah')))\n",
    "# graph.to_graphvis()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import copy\n",
    "import re\n",
    "from malaya.text.regex import _expressions\n",
    "import random\n",
    "\n",
    "def reset_t(tokens):\n",
    "    t = []\n",
    "    for i in range(len(tokens)):\n",
    "        t.append([tokens[i], 2])\n",
    "    return t\n",
    "\n",
    "# [penjodoh bilangan] [kata nama] -> [wrong penjodoh bilangan] [kata nama - kata nama]\n",
    "# dua buah kereta -> dua biji kereta\n",
    "def augment_10_0(t, row):\n",
    "    text, tokens, tokens_lower, penjodoh = row\n",
    "    for word in penjodoh:\n",
    "        try:\n",
    "            i = tokens_lower.index(word)\n",
    "            negate = list(set_penjodoh_bilangan - {word})\n",
    "            choice = random.choice(negate)\n",
    "            t[i][0] = choice\n",
    "            t[i][1] = 10\n",
    "        except Exception as e:\n",
    "            print(e)\n",
    "            pass\n",
    "\n",
    "# se[penjodoh bilangan] [kata nama] -> se[wrong penjodoh bilangan] [kata nama - kata nama]\n",
    "# sebuah kereta -> sebiji kereta\n",
    "def augment_10_1(t, row):\n",
    "    text, tokens, tokens_lower, penjodoh = row\n",
    "    for word in penjodoh:\n",
    "        try:\n",
    "            i = tokens_lower.index(word)\n",
    "            negate = list(set_sepenjodoh_bilangan - {word})\n",
    "            choice = random.choice(negate)\n",
    "            t[i][0] = choice\n",
    "            t[i][1] = 10\n",
    "        except Exception as e:\n",
    "            print(e)\n",
    "            pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1/1 [00:00<00:00, 2032.12it/s]\n"
     ]
    }
   ],
   "source": [
    "results = []\n",
    "for text in tqdm(['2 buah kereta']):\n",
    "    tokens = tokenizer(text)\n",
    "    t = reset_t(tokens)\n",
    "    t_ = copy.deepcopy(t)\n",
    "    tokens_lower = tokenizer(text.lower())\n",
    "    set_tokens = set(tokens_lower)\n",
    "    r_penjodoh_bilangan = set_tokens & set_penjodoh_bilangan\n",
    "    r_sepenjodoh_bilangan = set_tokens & set_sepenjodoh_bilangan\n",
    "    r = (t, tokens, tokens_lower, r_penjodoh_bilangan)\n",
    "    augment_10_0(t_, r)\n",
    "    r = (t, tokens, tokens_lower, r_sepenjodoh_bilangan)\n",
    "    augment_10_1(t_, r)\n",
    "    results.append((t, t_))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[([['2', 2], ['buah', 2], ['kereta', 2]],\n",
       "  [['2', 2], ['angkatan', 10], ['kereta', 2]])]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
